package com.algo.word;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

import com.algo.constant.Constants;
import com.algo.dict.Dictionary;
import com.algo.dict.SCWSDictionary;

/**
 * Uses the output of {@link NagaoAlgorithm} to score candidate words
 * and writes out the new words found (those not in the dictionary).
 * @author lujianfeng@miaozhen.com
 *
 */

public class NewWords {

	/**
	 * One candidate word read from the words-information file, together with
	 * its computed score, its term frequency and whether the dictionary knows it.
	 * Kept as an inner (non-static) class because {@link #getPOS()} reads the
	 * enclosing instance's dictionary.
	 */
	private class WordScore{
		private String word;
		private double score;
		private int tf;
		private boolean inDict;
		public boolean inDict(){
			return inDict;
		}
		public WordScore(String word, double score, boolean inDict, int tf){
			this.word = word;
			this.score = score;
			this.inDict = inDict;
			this.tf = tf;
		}
		/**
		 * @return the dictionary's part of speech for a known word,
		 *         otherwise the dictionary's UNDEFINED marker
		 */
		public String getPOS(){
			if(inDict)	return dict.getPOS(word);
			else return dict.UNDEFINED;
		}
		/** @return one output line: word, part of speech and term frequency, tab separated. */
		@Override
		public String toString(){
			return word + "\t" + getPOS() + "\t" + tf + "\n";
		}
	}

	/**
	 * Orders words by descending score. NaN scores are placed after every
	 * real score so that the ordering is total; a comparator that answers 0
	 * for every NaN comparison is not transitive and may make the sort fail.
	 */
	private static class WordScoreComparator implements Comparator<WordScore>{
		public int compare(WordScore w1, WordScore w2) {
			boolean nan1 = Double.isNaN(w1.score);
			boolean nan2 = Double.isNaN(w2.score);
			if(nan1 || nan2){
				if(nan1 && nan2) return 0;
				return nan1 ? 1 : -1;
			}
			// arguments swapped on purpose: higher score sorts first
			return Double.compare(w2.score, w1.score);
		}
	}

	/** Dictionary used to tell known words from new ones; set by getNewWords. */
	private	Dictionary dict;
	private List<WordScore> words = new ArrayList<WordScore>();
	/** Number of most recent words over which new words are counted. */
	private int windowSize;
	/** Output stops once the new-word count inside the window exceeds this value. */
	private int threshold;
	private NewWords(){}

	/**
	 * Computes the score of one record of the words-information file.
	 *
	 * @param wordNeighborInfoMI the 7 comma separated fields of one line: the word
	 *        followed by six numeric values (presumably frequency, neighbor
	 *        statistics and mutual information; confirm against NagaoAlgorithm)
	 * @return values[0] / (1/values[3] + 1/values[4]) * values[5], where values
	 *         are the six numeric fields in order
	 * @throws RuntimeException if the record does not have exactly 7 fields
	 * @throws NumberFormatException if a numeric field cannot be parsed
	 */
	private double getScore(String[] wordNeighborInfoMI){
		if(wordNeighborInfoMI.length != 7)
			throw new RuntimeException("result of NagaoAlgorithm has been damaged");

		double[] values = new double[wordNeighborInfoMI.length-1];
		for(int i = 0; i < values.length; i++)
			values[i] = Double.parseDouble(wordNeighborInfoMI[i+1]);

		return values[0]/(1.0/values[3] + 1.0/values[4])*values[5];
	}

	/**
	 * Reads the words-information file, scores and sorts every word, then
	 * writes the words that are not in the dictionary to the output file.
	 *
	 * @param wordsInfoPath path of the words-information file to read
	 * @param dict dictionary used to decide whether a word is already known
	 * @param out path of the file the new words are written to
	 * @param thresholds exactly two integers: window size and new-word threshold
	 * @throws RuntimeException if thresholds does not hold two values, a record
	 *         is malformed, or an I/O error occurs (the IOException is the cause)
	 */
	private void getNewWords(String wordsInfoPath, Dictionary dict, String out, String[] thresholds){
		// WordScore.getPOS() reads the field, not this parameter, so the field must be set.
		this.dict = dict;
		parseThresholds(thresholds);
		loadWords(wordsInfoPath);
		//sort words according to score
		Collections.sort(words, new WordScoreComparator());
		writeNewWords(out);
	}

	/** Parses the window size and the new-word threshold from their string form. */
	private void parseThresholds(String[] thresholds){
		if(thresholds.length != 2)
			throw new RuntimeException("threshold must be 2 number");
		windowSize = Integer.parseInt(thresholds[0].trim());
		threshold = Integer.parseInt(thresholds[1].trim());
	}

	/** Reads every record of the words-information file into {@link #words}. */
	private void loadWords(String wordsInfoPath){
		// NOTE(review): FileReader decodes with the platform default charset; confirm it
		// matches the charset NagaoAlgorithm used when writing this file.
		try {
			BufferedReader br = new BufferedReader(new FileReader(wordsInfoPath));
			try {
				String line;
				while((line = br.readLine()) != null){
					String[] items = line.split(",");
					words.add(new WordScore(items[0], getScore(items), dict.inDictionary(items[0]), Integer.parseInt(items[1])));
				}
			} finally {
				// close even when a record is malformed, so the file handle is not leaked
				br.close();
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Walks the sorted words and writes every word that is not in the dictionary.
	 * A sliding window of the last windowSize words is maintained; writing stops
	 * as soon as the number of new words inside that window exceeds threshold
	 * (the word that crosses the limit has already been written).
	 */
	private void writeNewWords(String out){
		LinkedList<WordScore> window = new LinkedList<WordScore>();
		int newWordsNumber = 0;		//count of new words among the words currently in the window
		try {
			BufferedWriter bw = new BufferedWriter(new FileWriter(out));
			try {
				for(WordScore word : words){
					if(window.size() >= windowSize){
						// the oldest word leaves the window; stop counting it
						WordScore head = window.removeFirst();
						if(!head.inDict())	newWordsNumber--;
					}
					window.addLast(word);
					if(!word.inDict()){
						bw.write(word.toString());
						newWordsNumber++;
					}
					if(newWordsNumber > threshold)	break;
				}
			} finally {
				// close (and flush) even when writing fails part-way
				bw.close();
			}
		} catch (IOException e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Finds new words using the default filter {@code Constants.NEWWORDSFILTER}.
	 *
	 * @param wordsInfoPath path of the words-information file to read
	 * @param dict dictionary of known words
	 * @param out path of the file the new words are written to
	 */
	public static void findNewWords(String wordsInfoPath, Dictionary dict, String out){
		NewWords nw = new NewWords();
		nw.getNewWords(wordsInfoPath, dict, out, Constants.NEWWORDSFILTER.split(","));
	}

	/**
	 * Finds new words using caller supplied thresholds.
	 *
	 * @param wordsInfoPath path of the words-information file to read
	 * @param dict dictionary of known words
	 * @param out path of the file the new words are written to
	 * @param thresholds exactly two integers: window size and new-word threshold
	 */
	public static void findNewWords(String wordsInfoPath, Dictionary dict, String out, String[] thresholds){
		NewWords nw = new NewWords();
		nw.getNewWords(wordsInfoPath, dict, out, thresholds);
	}

	/**
	 * Runs NagaoAlgorithm over every file below dataDir to produce the
	 * words-information file, then extracts the new words from it.
	 *
	 * @param dataDir file or directory holding the input text
	 * @param wordsInfoPath path passed to NagaoAlgorithm and then read back here
	 * @param dict dictionary of known words
	 * @param newWords path of the file the new words are written to
	 * @param filter comma separated "windowSize,threshold"
	 */
	public static void findNewWords(String dataDir, String wordsInfoPath, Dictionary dict, String newWords, String filter){

		List<String> files = getAllFiles(new File(dataDir));
		String[] paths = files.toArray(new String[files.size()]);
		NagaoAlgorithm.applyNagao(paths, wordsInfoPath, Constants.SPLITWORDSPATH, Constants.NGRAM, Constants.NAGAOFILTER);

		NewWords nw = new NewWords();
		nw.getNewWords(wordsInfoPath, dict, newWords, filter.split(","));
	}

	/**
	 * Recursively collects the absolute paths of all regular files at or below
	 * the given path.
	 *
	 * @throws RuntimeException if the path cannot be listed (for example it
	 *         does not exist or is not readable)
	 */
	private static List<String> getAllFiles(File dirOrFile){
		List<String> files = new ArrayList<String>();
		if(dirOrFile.isFile()){
			files.add(dirOrFile.getAbsolutePath());
			return files;
		}
		File[] children = dirOrFile.listFiles();
		// listFiles() returns null when the path is not a directory or cannot be read
		if(children == null)
			throw new RuntimeException("cannot list files under " + dirOrFile.getAbsolutePath());
		for(File subPath : children)
			files.addAll(getAllFiles(subPath));
		return files;
	}

	/**
	 * Command line entry point. Uses the default SCWS dictionary and the
	 * default new words filter.
	 *
	 * @param args data directory, words-information path, new-words output path
	 */
	public static void main(String[] args){
		if(args.length != 3){
			System.err.println("usage: NewWords <dataDir> <wordsInfoPath> <newWordsOutput>");
			return;
		}
		findNewWords(args[0], args[1], new SCWSDictionary(Constants.SCWSDICTIONARY), args[2], Constants.NEWWORDSFILTER);
	}

}
